The Center for World University Rankings(CWUR, https://cwur.org/)에서 제공하는 전 세계 대학 순위 정보. 2012년~2018년의 대학별 세계 순위, 국내 순위, 교육의 질, 졸업생 취업, 교수진의 질, 출판물, 영향력, 인용, 특허, 총점 등의 정보를 제공한다.
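CWUR uses seven objective and robust indicators to rank the world’s top 1000 universities: quality of education, alumni employment, quality of faculty, research output, quality publications, influence, and citations.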
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Load the CWUR rankings, keeping text columns as character vectors rather than factors.
# FALSE is spelled out because the shorthand F is an ordinary variable that can be reassigned.
data <- read.csv("data/cwur_data.csv", stringsAsFactors = FALSE)
# Show each column's type together with its first few values.
str(data)
## 'data.frame': 5200 obs. of 12 variables:
## $ world_rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ institution : chr "Harvard University" "Massachusetts Institute of Technology" "Stanford University" "University of Cambridge" ...
## $ country : chr "USA" "USA" "USA" "United Kingdom" ...
## $ national_rank : int 1 2 3 1 4 5 2 6 7 8 ...
## $ quality_of_education: int 7 9 17 10 2 8 13 14 23 16 ...
## $ alumni_employment : int 9 17 11 24 29 14 28 31 21 52 ...
## $ quality_of_faculty : int 1 3 5 4 7 2 9 12 10 6 ...
## $ publications : int 1 12 4 16 37 53 15 14 13 6 ...
## $ influence : int 1 4 2 16 22 33 13 6 12 5 ...
## $ citations : int 1 4 2 11 22 26 19 15 14 3 ...
## $ score : num 100 91.7 89.5 86.2 85.2 ...
## $ year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
# Number of missing values in each column.
vapply(data, function(column) sum(is.na(column)), integer(1))
## world_rank institution country
## 0 0 0
## national_rank quality_of_education alumni_employment
## 0 597 0
## quality_of_faculty publications influence
## 731 0 0
## citations score year
## 0 0 0
# Hmisc::describe(data)
# Drop the categorical columns at positions 2 and 3, and leave out the 2018
# rows because they contain missing values.
data1 <- data[data$year != 2018, -c(2, 3)]
# Count the missing values that remain.
sum(is.na(data1))
## [1] 0
# Inspect the structure of data1 after the column and row filtering.
str(data1)
## 'data.frame': 4200 obs. of 10 variables:
## $ world_rank : int 1 2 3 4 5 6 7 8 9 10 ...
## $ national_rank : int 1 2 3 1 4 5 2 6 7 8 ...
## $ quality_of_education: int 7 9 17 10 2 8 13 14 23 16 ...
## $ alumni_employment : int 9 17 11 24 29 14 28 31 21 52 ...
## $ quality_of_faculty : int 1 3 5 4 7 2 9 12 10 6 ...
## $ publications : int 1 12 4 16 37 53 15 14 13 6 ...
## $ influence : int 1 4 2 16 22 33 13 6 12 5 ...
## $ citations : int 1 4 2 11 22 26 19 15 14 3 ...
## $ score : num 100 91.7 89.5 86.2 85.2 ...
## $ year : int 2012 2012 2012 2012 2012 2012 2012 2012 2012 2012 ...
# The same preparation written as a dplyr pipeline.
data1 <- data %>%
  filter(year != 2018) %>%
  select(-c(2, 3))
# Print the correlation matrix of the remaining columns.
cor(data1)
## world_rank national_rank quality_of_education
## world_rank 1.0000000 0.23832107 0.6343275
## national_rank 0.2383211 1.00000000 0.1853993
## quality_of_education 0.6343275 0.18539934 1.0000000
## alumni_employment 0.6289032 0.10561614 0.5572212
## quality_of_faculty 0.6249865 0.19657009 0.7491310
## publications 0.9170506 0.32654389 0.5762741
## influence 0.8928859 0.16228501 0.6074201
## citations 0.8444044 0.19559639 0.5826268
## score -0.5491533 -0.19105680 -0.5967928
## year 0.1620879 0.04881068 0.2587159
## alumni_employment quality_of_faculty publications
## world_rank 0.6289032 0.6249865 0.9170506
## national_rank 0.1056161 0.1965701 0.3265439
## quality_of_education 0.5572212 0.7491310 0.5762741
## alumni_employment 1.0000000 0.5005800 0.5318997
## quality_of_faculty 0.5005800 1.0000000 0.5907925
## publications 0.5318997 0.5907925 1.0000000
## influence 0.4780830 0.6224009 0.8626805
## citations 0.5053959 0.6121459 0.8105269
## score -0.4941307 -0.6955724 -0.5199695
## year 0.2744686 0.3609531 0.1606745
## influence citations score year
## world_rank 0.8928859 0.8444044 -0.5491533 0.16208788
## national_rank 0.1622850 0.1955964 -0.1910568 0.04881068
## quality_of_education 0.6074201 0.5826268 -0.5967928 0.25871591
## alumni_employment 0.4780830 0.5053959 -0.4941307 0.27446860
## quality_of_faculty 0.6224009 0.6121459 -0.6955724 0.36095308
## publications 0.8626805 0.8105269 -0.5199695 0.16067453
## influence 1.0000000 0.8280916 -0.5223281 0.16077842
## citations 0.8280916 1.0000000 -0.5189880 0.17755460
## score -0.5223281 -0.5189880 1.0000000 -0.20910887
## year 0.1607784 0.1775546 -0.2091089 1.00000000
# Print the correlation matrix rounded to two decimal places.
data1 %>%
  cor() %>%
  round(digits = 2)
## world_rank national_rank quality_of_education
## world_rank 1.00 0.24 0.63
## national_rank 0.24 1.00 0.19
## quality_of_education 0.63 0.19 1.00
## alumni_employment 0.63 0.11 0.56
## quality_of_faculty 0.62 0.20 0.75
## publications 0.92 0.33 0.58
## influence 0.89 0.16 0.61
## citations 0.84 0.20 0.58
## score -0.55 -0.19 -0.60
## year 0.16 0.05 0.26
## alumni_employment quality_of_faculty publications
## world_rank 0.63 0.62 0.92
## national_rank 0.11 0.20 0.33
## quality_of_education 0.56 0.75 0.58
## alumni_employment 1.00 0.50 0.53
## quality_of_faculty 0.50 1.00 0.59
## publications 0.53 0.59 1.00
## influence 0.48 0.62 0.86
## citations 0.51 0.61 0.81
## score -0.49 -0.70 -0.52
## year 0.27 0.36 0.16
## influence citations score year
## world_rank 0.89 0.84 -0.55 0.16
## national_rank 0.16 0.20 -0.19 0.05
## quality_of_education 0.61 0.58 -0.60 0.26
## alumni_employment 0.48 0.51 -0.49 0.27
## quality_of_faculty 0.62 0.61 -0.70 0.36
## publications 0.86 0.81 -0.52 0.16
## influence 1.00 0.83 -0.52 0.16
## citations 0.83 1.00 -0.52 0.18
## score -0.52 -0.52 1.00 -0.21
## year 0.16 0.18 -0.21 1.00
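The corrplot() function from the R package corrplot draws a graphical display (correlogram) of a correlation matrix.
The simplified format of the function is corrplot(corr, method = "circle"), where corr is the correlation matrix to visualize and method selects the glyph drawn in each cell: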
library(corrplot)
## corrplot 0.84 loaded
# Correlation matrix shared by the correlograms that follow.
corr <- cor(data1)
# Default-style correlogram with one circle per cell.
corrplot(corr, method = "circle")
Seven different visualization methods can be used : “circle”, “square”, “ellipse”, “number”, “shade”, “color”, “pie”.
# Pie glyphs.
corrplot(corr, method = "pie")
# Filled colour tiles.
corrplot(corr, method = "color")
# Display the correlation coefficient in each cell.
corrplot(corr, method = "number")
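There are three types of layout: "full" (the default, which displays the whole matrix), "upper" (upper triangle only) and "lower" (lower triangle only):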
# Upper triangle only.
corrplot(corr, type = "upper")
# Lower triangle only.
corrplot(corr, type = "lower")
The correlation matrix can be reordered according to the correlation coefficient. This is important to identify the hidden structure and pattern in the matrix. “hclust” for hierarchical clustering order is used in the following examples.
# Upper-triangle correlogram with variables reordered by hierarchical clustering.
corrplot(corr, order = "hclust", type = "upper")
The colors of the correlogram can be customized through the col argument. A color palette from the RColorBrewer package is used in the R script below:
library(RColorBrewer)
# Clustered upper-triangle correlogram coloured with the 8-colour "PuOr" Brewer palette.
corrplot(
  corr,
  type = "upper",
  order = "hclust",
  col = brewer.pal(n = 8, name = "PuOr")
)
The color and the rotation of the text labels can be changed as well: tl.col sets the text label color and tl.srt sets the rotation of the labels in degrees:
# Upper-triangle correlogram with black text labels rotated by 45 degrees.
corrplot(corr, type = "upper", tl.srt = 45, tl.col = "black")
# Convert the data frame to a tibble so that printing shows only a preview.
# as_tibble() is used because tbl_df() is deprecated in current dplyr releases.
data <- as_tibble(data)
data
## # A tibble: 5,200 x 12
## world_rank institution country national_rank quality_of_educa…
## <int> <chr> <chr> <int> <int>
## 1 1 Harvard University USA 1 7
## 2 2 Massachusetts Inst… USA 2 9
## 3 3 Stanford University USA 3 17
## 4 4 University of Camb… United … 1 10
## 5 5 California Institu… USA 4 2
## 6 6 Princeton Universi… USA 5 8
## 7 7 University of Oxfo… United … 2 13
## 8 8 Yale University USA 6 14
## 9 9 Columbia University USA 7 23
## 10 10 University of Cali… USA 8 16
## # ... with 5,190 more rows, and 7 more variables: alumni_employment <int>,
## # quality_of_faculty <int>, publications <int>, influence <int>,
## # citations <int>, score <dbl>, year <int>
데이터 프레임을 tibble로 변환하는 것은 dplyr가 제공하는 기능 중 특별한 것은 아니다. 다만 크기가 큰 데이터를 실수로 콘솔에서 실행하면 모든 데이터가 출력되면서 시간이 오래 걸리는데, tibble은 이를 방지하기 위해 데이터의 일부(처음 10행과 화면에 들어가는 열)만 보여준다.
# Global aliases are found before any attached package, so these names always
# resolve to dplyr's verbs.
select <- dplyr::select
filter <- dplyr::filter
# Rows that hold the number-one world rank.
data %>% filter(world_rank == 1)
## # A tibble: 7 x 12
## world_rank institution country national_rank quality_of_education
## <int> <chr> <chr> <int> <int>
## 1 1 Harvard University USA 1 7
## 2 1 Harvard University USA 1 1
## 3 1 Harvard University USA 1 1
## 4 1 Harvard University USA 1 1
## 5 1 Harvard University USA 1 1
## 6 1 Harvard University USA 1 1
## 7 1 Harvard University USA 1 2
## # ... with 7 more variables: alumni_employment <int>,
## # quality_of_faculty <int>, publications <int>, influence <int>,
## # citations <int>, score <dbl>, year <int>
# South Korean universities with a world rank better than 100.
data %>% filter(country == "South Korea", world_rank < 100)
## # A tibble: 8 x 12
## world_rank institution country national_rank quality_of_educat…
## <int> <chr> <chr> <int> <int>
## 1 75 Seoul National Un… South Ko… 1 101
## 2 40 Seoul National Un… South Ko… 1 101
## 3 24 Seoul National Un… South Ko… 1 355
## 4 24 Seoul National Un… South Ko… 1 367
## 5 98 Yonsei University South Ko… 2 367
## 6 24 Seoul National Un… South Ko… 1 378
## 7 23 Seoul National Un… South Ko… 1 383
## 8 60 Seoul National Un… South Ko… 1 NA
## # ... with 7 more variables: alumni_employment <int>,
## # quality_of_faculty <int>, publications <int>, influence <int>,
## # citations <int>, score <dbl>, year <int>
# Total number of missing values in the whole table.
data %>%
  is.na() %>%
  sum()
## [1] 1328
# Number of missing values in each variable.
vapply(data, function(column) sum(is.na(column)), integer(1))
## world_rank institution country
## 0 0 0
## national_rank quality_of_education alumni_employment
## 0 597 0
## quality_of_faculty publications influence
## 731 0 0
## citations score year
## 0 0 0
# Year-grouped view of the full table.
gdata <- data %>% group_by(year)
# Drop the 2018 rows, then count the missing values per column in what remains.
clean_data <- filter(data, year != 2018)
vapply(clean_data, function(column) sum(is.na(column)), integer(1))
## world_rank institution country
## 0 0 0
## national_rank quality_of_education alumni_employment
## 0 0 0
## quality_of_faculty publications influence
## 0 0 0
## citations score year
## 0 0 0
# Keep only the country column.
select(data, country)
## # A tibble: 5,200 x 1
## country
## <chr>
## 1 USA
## 2 USA
## 3 USA
## 4 United Kingdom
## 5 USA
## 6 USA
## 7 United Kingdom
## 8 USA
## 9 USA
## 10 USA
## # ... with 5,190 more rows
# Keep the contiguous run of columns from quality_of_education through citations.
select(data, quality_of_education:citations)
## # A tibble: 5,200 x 6
## quality_of_education alumni_employment quality_of_faculty publications
## <int> <int> <int> <int>
## 1 7 9 1 1
## 2 9 17 3 12
## 3 17 11 5 4
## 4 10 24 4 16
## 5 2 29 7 37
## 6 8 14 2 53
## 7 13 28 9 15
## 8 14 31 12 14
## 9 23 21 10 13
## 10 16 52 6 6
## # ... with 5,190 more rows, and 2 more variables: influence <int>,
## # citations <int>
# Line chart of the five best-ranked institutions of each year.
data %>%
  select(year, institution, world_rank) %>%
  group_by(year) %>%
  # A negative n makes top_n() keep the rows with the smallest world_rank per year.
  top_n(-5, wt = world_rank) %>%
  ggplot(aes(x = year, y = world_rank, group = institution, color = institution)) +
  geom_line() +
  geom_point(aes(shape = institution)) +
  labs(
    x = "Year", y = "World Rank",
    title = "World Ranks (2012-2018)",
    subtitle = "Best World ranked Universities by CWUR"
  ) +
  theme_bw()
# data %>%
# select(world_rank, institution, year) %>%
# filter(institution == "California Institute of Technology")
# Per country and year: the number of ranked universities, plus the best (min),
# worst (max) and rounded mean world rank among them.
# summarise() already returns exactly these columns in this order, so no
# select() step is needed afterwards.
ccwur <- data %>%
  group_by(country, year) %>%
  summarise(
    nr = n(),
    minw = min(world_rank),
    maxw = max(world_rank),
    avgw = round(mean(world_rank), 0)
  ) %>%
  ungroup()
# light grey boundaries
#l <- list(color = toRGB("grey"), width = 0.5)
# HTML text for each country/year row, later supplied to the map as its text.
# paste() keeps its default separator, so a single space is inserted between the pieces.
ccwur$hover <- paste(
  "Country: ", ccwur$country, "<br>",
  "Year: ", ccwur$year, "<br>",
  "Universities in top: ", ccwur$nr, "<br>",
  "Min rank in top: ", ccwur$minw, "<br>",
  "Max rank in top: ", ccwur$maxw, "<br>",
  "Mean rank in top: ", ccwur$avgw, "<br>"
)
# Map projection and display options, passed to layout(geo = g) below.
g <- list(
  showframe = TRUE,
  showcoastlines = TRUE,
  # "orthographic" (globe view) is the valid plotly projection name;
  # "orthogonal" is not a recognised projection type.
  projection = list(type = "orthographic")
)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# World map with one animation frame per year; each country is coloured by its
# number of ranked universities and shows the prepared hover text.
plot_geo(ccwur, locationmode = "country names") %>%
  add_trace(
    locations = ~country,
    z = ~nr,
    color = ~nr,
    colors = "Spectral",
    text = ~hover,
    frame = ~year
  ) %>%
  colorbar(title = "Number of\nuniversities in top", tickprefix = "") %>%
  layout(title = "Number of universities in top", geo = g)